import pickle
from gensim import corpora, models
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
import sys
from scipy import stats
from util import P_star_Symbol
import compare_auc_delong_xu
# Experiment 1: five-fold cross validation of a logistic-regression classifier
# that separates Congress accounts (label 0) from mayor accounts (label 1)
# using their topic-model vectors, repeated over 100 pre-computed batches.
topic_number = 26
total_auc_result = []  # one pooled five-fold AUC per batch
result_folder = '../data/results/new_topic_number_26/'
five_fold_label = []   # held-out labels pooled over every fold of every batch
five_fold_result = []  # predicted scores aligned with five_fold_label

# The party-affiliation lookups do not depend on the batch, so they are read
# once here instead of on every iteration; `with` closes each handle right
# after use.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# point this script at pickles produced by this project.
with open('congress_party_affiliation', 'rb') as handle:
    congress_party_affiliation_dict = pickle.load(handle)
with open('mayor_party_affiliation_dict', 'rb') as handle:
    mayor_party_affiliation_dict = pickle.load(handle)

for batch_number in range(100):
    # Seed both generators so the shuffle below is reproducible per batch.
    random.seed(batch_number)
    np.random.seed(batch_number)
    tmp_folder = '../data/tmp_files/new_topic_number_26/boosting_files/congress_mayor/' + str(batch_number) + '/'

    with open(tmp_folder + 'congress_topic_model_matrix_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        congress_matrix = pickle.load(handle)
    with open(tmp_folder + 'congress_name_list_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        congress_name_list = pickle.load(handle)
    # Keep only members whose recorded party is 'r' (label 0) or 'd' (label 1);
    # any other party value is dropped. A name missing from the lookup raises
    # KeyError.
    congress_label = []
    congress_name_list_actual = []
    congress_matrix_actual = []
    for i, eachone in enumerate(congress_name_list):
        party = congress_party_affiliation_dict[eachone]
        if party == 'r' or party == 'd':
            congress_label.append(0 if party == 'r' else 1)
            congress_name_list_actual.append(eachone)
            congress_matrix_actual.append(congress_matrix[i])

    with open(tmp_folder + 'mayor_topic_model_matrix_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        mayor_matrix = pickle.load(handle)
    with open(tmp_folder + 'mayor_name_list_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        mayor_name_list = pickle.load(handle)
    # Same filter for mayors, whose parties are coded 'rep' (0) / 'dem' (1).
    mayor_label = []
    mayor_name_list_actual = []
    mayor_matrix_actual = []
    for i, eachone in enumerate(mayor_name_list):
        party = mayor_party_affiliation_dict[eachone]
        if party == 'rep' or party == 'dem':
            mayor_label.append(0 if party == 'rep' else 1)
            mayor_name_list_actual.append(eachone)
            mayor_matrix_actual.append(mayor_matrix[i])

    # 80/20 cross validation: shuffle once, then hold out each fifth in turn.
    # The classification target here is the group (Congress = 0, mayor = 1),
    # not the party labels collected above.
    all_matrix = congress_matrix_actual + mayor_matrix_actual
    all_label = [0] * len(congress_matrix_actual) + [1] * len(mayor_matrix_actual)
    tmp = list(zip(all_matrix, all_label))
    random.shuffle(tmp)
    all_matrix, all_label = zip(*tmp)  # both become tuples
    total_num = len(all_matrix)
    total_five_fold_scores = []
    total_five_fold_lables = []
    for i in range(5):
        # Fold bounds use the same float arithmetic as before so that the
        # split points stay exactly where they were.
        split_ratio_1 = i * 1.0 / 5
        split_ratio_2 = (i + 1) * 1.0 / 5
        fold_start = int(split_ratio_1 * total_num)
        fold_end = int(split_ratio_2 * total_num)
        train_vector = all_matrix[:fold_start] + all_matrix[fold_end:]
        train_label = all_label[:fold_start] + all_label[fold_end:]
        test_vector = all_matrix[fold_start:fold_end]
        test_label = all_label[fold_start:fold_end]
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(train_vector, train_label)
        temp_result = clf.predict_proba(test_vector)
        # Column 1 of predict_proba is used as the score for label 1.
        total_five_fold_scores += [row[1] for row in temp_result]
        total_five_fold_lables += test_label
    five_fold_label += total_five_fold_lables
    five_fold_result += total_five_fold_scores
    # AUC over the pooled held-out predictions of this batch.
    fpr, tpr, thresholds = roc_curve(total_five_fold_lables, total_five_fold_scores)
    total_auc_result.append(auc(fpr, tpr))

print(np.mean(total_auc_result))
# DeLong AUC and variance over the predictions pooled across all batches.
auc_delong, variances_delong = compare_auc_delong_xu.delong_roc_variance(np.array(five_fold_label), np.array(five_fold_result))
print("Delong AUC, STD:")
print(auc_delong, np.sqrt(variances_delong))

# leave one out experiment  -- mayors
# For every batch, each mayor is held out in turn and a classifier is trained
# on the remaining mayors (label 0) plus all kept Congress members (label 1).
# The held-out mayor's score for label 1 is recorded, written to a per-batch
# CSV row, and finally averaged over the 100 batches.
#
# NOTE(review): the matrix width and the CSV header below rely on
# mayor_matrix_actual / mayor_name_list_actual as left behind by the code
# above this section, i.e. this assumes every batch yields the same mayors in
# the same order -- TODO confirm.
national_similarity_scores_all = np.zeros((100, len(mayor_matrix_actual)))

# The party-affiliation lookups are batch-independent: read them once, and
# before the results file is opened so that a missing lookup does not truncate
# an existing CSV.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# point this script at pickles produced by this project.
with open('congress_party_affiliation', 'rb') as handle:
    congress_party_affiliation_dict = pickle.load(handle)
with open('mayor_party_affiliation_dict', 'rb') as handle:
    mayor_party_affiliation_dict = pickle.load(handle)

# `with` guarantees the CSV is flushed and closed even if a batch fails.
with open(result_folder + 'mayor_LDA_stable_test.csv', 'w') as writer:
    # Header row: an empty first cell followed by one column per mayor.
    for eachone in mayor_name_list_actual:
        writer.write(',' + eachone)
    writer.write('\n')
    for batch_number in range(100):
        # Seed both generators so the shuffles below are reproducible.
        random.seed(batch_number)
        np.random.seed(batch_number)
        tmp_folder = '../data/tmp_files/new_topic_number_26/boosting_files/congress_mayor/' + str(batch_number) + '/'

        with open(tmp_folder + 'congress_topic_model_matrix_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
            congress_matrix = pickle.load(handle)
        with open(tmp_folder + 'congress_name_list_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
            congress_name_list = pickle.load(handle)
        # Keep only members whose recorded party is 'r' (0) or 'd' (1).
        congress_label = []
        congress_name_list_actual = []
        congress_matrix_actual = []
        for i, eachone in enumerate(congress_name_list):
            party = congress_party_affiliation_dict[eachone]
            if party == 'r' or party == 'd':
                congress_label.append(0 if party == 'r' else 1)
                congress_name_list_actual.append(eachone)
                congress_matrix_actual.append(congress_matrix[i])

        with open(tmp_folder + 'mayor_topic_model_matrix_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
            mayor_matrix = pickle.load(handle)
        with open(tmp_folder + 'mayor_name_list_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
            mayor_name_list = pickle.load(handle)
        # Keep only mayors whose recorded party is 'rep' (0) or 'dem' (1).
        mayor_label = []
        mayor_name_list_actual = []
        mayor_matrix_actual = []
        for i, eachone in enumerate(mayor_name_list):
            party = mayor_party_affiliation_dict[eachone]
            if party == 'rep' or party == 'dem':
                mayor_label.append(0 if party == 'rep' else 1)
                mayor_name_list_actual.append(eachone)
                mayor_matrix_actual.append(mayor_matrix[i])

        # One CSV row per batch: the batch number, then one score per mayor.
        writer.write(str(batch_number))
        for i in range(len(mayor_matrix_actual)):
            # Training set: every mayor except i (label 0) + Congress (label 1).
            train_vector = mayor_matrix_actual[0:i] + mayor_matrix_actual[i+1:] + congress_matrix_actual
            train_label = [0] * (len(mayor_matrix_actual) - 1) + [1] * len(congress_matrix_actual)
            test_vector = mayor_matrix_actual[i]
            tmp = list(zip(train_vector, train_label))
            random.shuffle(tmp)
            train_vector, train_label = zip(*tmp)
            clf = LogisticRegression(class_weight='balanced')
            clf.fit(train_vector, train_label)
            # Column 1 of predict_proba is used as the score for label 1.
            temp_result = clf.predict_proba([test_vector])[0][1]
            national_similarity_scores_all[batch_number][i] = temp_result
            writer.write(',' + str(temp_result))
        writer.write('\n')

# Average each mayor's score over the batches and rank ascending by score.
national_similarity_scores_avg = np.mean(national_similarity_scores_all, axis=0)
result = [[score, mayor_name_list_actual[i]]
          for i, score in enumerate(national_similarity_scores_avg)]
sort_result = sorted(result, key=lambda x: x[0])
print(sort_result[0:5])
print(sort_result[-5:])
# name -> averaged score, inserted in ascending-score order.
national_similarity_dict = {}
for eachone in sort_result:
    national_similarity_dict[eachone[1]] = eachone[0]
with open(result_folder + 'mayor_national_similarity_dict', 'wb') as handle:
    pickle.dump(national_similarity_dict, handle)

# Leave-one-out partisanship experiment: for every batch, each mayor is held
# out in turn and a classifier trained on the remaining mayors' party labels
# ('rep' = 0, 'dem' = 1) scores the held-out mayor. Scores are averaged over
# the 100 batches and stored per mayor name.
#
# NOTE(review): the matrix width comes from mayor_matrix_actual as left behind
# by the code above this section, i.e. this assumes every batch yields the
# same number of mayors -- TODO confirm.
partisanship_scores_all = np.zeros((100, len(mayor_matrix_actual)))

# The party-affiliation lookup is batch-independent, so read it once; `with`
# closes the handle right after use.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# point this script at pickles produced by this project.
with open('mayor_party_affiliation_dict', 'rb') as handle:
    mayor_party_affiliation_dict = pickle.load(handle)

for batch_number in range(100):
    # Seed both generators so the shuffles below are reproducible.
    random.seed(batch_number)
    np.random.seed(batch_number)
    tmp_folder = '../data/tmp_files/new_topic_number_26/boosting_files/congress_mayor/' + str(batch_number) + '/'
    with open(tmp_folder + 'mayor_topic_model_matrix_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        mayor_matrix = pickle.load(handle)
    with open(tmp_folder + 'mayor_name_list_4th_sample_current_only_' + str(batch_number), 'rb') as handle:
        mayor_name_list = pickle.load(handle)
    # Keep only mayors whose recorded party is 'rep' (0) or 'dem' (1).
    mayor_label = []
    mayor_name_list_actual = []
    mayor_matrix_actual = []
    for i, eachone in enumerate(mayor_name_list):
        party = mayor_party_affiliation_dict[eachone]
        if party == 'rep' or party == 'dem':
            mayor_label.append(0 if party == 'rep' else 1)
            mayor_name_list_actual.append(eachone)
            mayor_matrix_actual.append(mayor_matrix[i])

    for i in range(len(mayor_matrix_actual)):
        # Train on every mayor except i, then score mayor i.
        train_vector = mayor_matrix_actual[0:i] + mayor_matrix_actual[i+1:]
        train_label = mayor_label[0:i] + mayor_label[i+1:]
        test_vector = mayor_matrix_actual[i]
        tmp = list(zip(train_vector, train_label))
        random.shuffle(tmp)
        train_vector, train_label = zip(*tmp)
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(train_vector, train_label)
        # Column 1 of predict_proba is used as the score for label 1 ('dem').
        temp_result = clf.predict_proba([test_vector])[0][1]
        partisanship_scores_all[batch_number][i] = temp_result

# Average each mayor's score over the batches; names come from the last batch.
total_partisanship_list = np.mean(partisanship_scores_all, axis=0)
partisanship_dict = {}
for i, test_name in enumerate(mayor_name_list_actual):
    partisanship_dict[test_name] = total_partisanship_list[i]
with open(result_folder + 'mayor_partisanship_dict', 'wb') as handle:
    pickle.dump(partisanship_dict, handle)

# Export one row per mayor: name, averaged similarity-to-Congress score and
# averaged partisanship score. `with` ensures the file is closed even if a
# name in partisanship_dict is missing from national_similarity_dict
# (which raises KeyError).
with open(result_folder + 'congress_mayor_similarity_partisanship_topic_space_4th_full_timelimit.csv', 'w') as writer:
    writer.write("twitter_handler, similarity score to Congress people, partisanship scores with word representation\n")
    for each_name, partisanship_score in partisanship_dict.items():
        writer.write(each_name + ', ' + str(national_similarity_dict[each_name]) + ', ' + str(partisanship_score) + '\n')
